{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 数据清洗"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "#读取数据库数据\n",
    "import pandas as pd\n",
    "from pandas import DataFrame as df\n",
    "Comment_data = pd.read_csv('data/JDComment_data.csv')  # 读取训练数据"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "Comment_data[Comment_data['评论内容'] == '此用户未填写评价内容' ]#查看评论前几行数据"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "填充缺省值以及删除第一行"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "Comment_data.drop(index=0,axis=0) #删除第一行\n",
    "Comment_data.isnull().any()#查看哪些列存在空值"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "#Comment_data[Comment_data.isnull().values==True]\n",
    "for col in Comment_data:\n",
    "    if col == \"会员级别\" or col == \"评论内容\":\n",
    "        Comment_data[col].fillna('', inplace=True)\n",
    "    elif col == \"用户ID\":\n",
    "        Comment_data[col].fillna('00000000000', inplace=True)\n",
    "    else:\n",
    "        Comment_data[col].fillna(0, inplace=True)\n",
    "Comment_data.isnull().any()#查看是否进行了填充"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "(DataFrame筛选数据与loc用法)[https://blog.csdn.net/junbujianwpl/article/details/70473659]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "#Comment_data[Comment_data.评论内容.duplicated()].count()\n",
    "#print(Comment_data[Comment_data.评论内容.duplicated()].count())\t# 最后两项重复\n",
    "#print(len(Comment_data[Comment_data.评论内容 == '此用户未填写评价内容']))\n",
    "Comment_data[Comment_data['评论内容']==\"此用户未填写评价内容\"] = \"\"#清空未评价信息\n",
    "print(len(Comment_data[Comment_data['评论内容']!=\"\"])) #统计有效评论数"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 数据可视化及分析\n",
    "### 会员等级编码"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from numpy import array\n",
    "from sklearn.preprocessing import LabelEncoder\n",
    "from sklearn import preprocessing\n",
    "# 会员等级分类\n",
    "data = Comment_data[Comment_data['会员级别']!='']\n",
    "value1 = array(data['会员级别'])\n",
    "phone_type = Comment_data[Comment_data['手机型号']!='']\n",
    "value2 = array(data['手机型号'])\n",
    "#print(values)\n",
    "# 手机型号分类\n",
    "#整数编码\n",
    "level_encoder = LabelEncoder()\n",
    "phone_type_encode = LabelEncoder()\n",
    "integer_level_encoded = level_encoder.fit_transform(value1)\n",
    "integer_type_encoded = phone_type_encode.fit_transform(value2)\n",
    "#print(label_encoder.get_params(False))\n",
    "print(integer_level_encoded)\n",
    "userLevel = level_encoder.classes_\n",
    "print(level_encoder.classes_)\n",
    "print(integer_type_encoded)\n",
    "phone_type = phone_type_encode.classes_\n",
    "print(phone_type_encode.classes_)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 购买时间处理"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "hourList = []\n",
    "month = []\n",
    "year = []\n",
    "useLessData = 0\n",
    "for i in Comment_data['购买时间']:\n",
    "    try:\n",
    "        #print(str(i))\n",
    "        hourList.append(str(i).split(' ')[1].split(':')[0])\n",
    "        month.append(str(i).split(' ')[0].split('-')[1])\n",
    "        year.append(str(i).split(' ')[0].split('-')[0])\n",
    "    except:\n",
    "        useLessData += 1\n",
    "print('无效数据有%s条'%useLessData)\n",
    "phone_year_encode = LabelEncoder()\n",
    "year_encoded = phone_year_encode.fit_transform(year)\n",
    "print(phone_year_encode.classes_)\n",
    "print(year_encoded)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "import random\n",
    "from pyecharts import Bar\n",
    "attr = [\"{}时\".format(i) for i in range(24)]\n",
    "v1 = [hourList.count(str(_).rjust(2,'0')) for _ in range(24)]\n",
    "bar = Bar(\"手机购买时段\",title_color='#FF0000',background_color='#7EC0EE')\n",
    "bar.add(\"\", attr, v1, is_label_show=True, is_datazoom_show=True)\n",
    "bar"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "分析一天中不同时段消费者的手机购买情况，可以分析得出，大部分消费者在午时（10-12时）和晚上（20-22时）出现了消费高峰期，在此时段顾客购买商品的概率更大，他们浏览商品的机会更多"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "\n",
    "v = [[]]\n",
    "for level in level_encoder.classes_:\n",
    "    print(level)\n",
    "    df = Comment_data[Comment_data['会员级别'] == level]\n",
    "    hourList = []\n",
    "    month = []\n",
    "    for i in df['购买时间']:\n",
    "        try:\n",
    "            #print(str(i).split(' ')[0].split('-')[1])\n",
    "            month.append(str(i).split(' ')[0].split('-')[1])\n",
    "        except:\n",
    "            useLessData += 1\n",
    "    print(useLessData)\n",
    "    data = []\n",
    "    for _ in range(1, 13):\n",
    "        if(_ < 10):\n",
    "            data.append(month.count('0'+str(_)))\n",
    "        else:\n",
    "            data.append(month.count(str(_)))\n",
    "    #print(data)\n",
    "    v.append(data)\n",
    "print(v)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pyecharts import Line\n",
    "import numpy as np\n",
    "\n",
    "attr = [\"一月\", \"二月\", \"三月\", \"四月\", \"五月\", \"六月\",'七月','八月','九月','十月','十一月','十二月']\n",
    "#plus = Comment_data[Comment_data['会员级别']=='PLUS会员']\n",
    "#v1 = [month.count(str(_)) for _ in range(0, 12)]\n",
    "#v2 = [55, 60, 16, 20, 15, 80]\n",
    "line = Line(\"月消费与会员等级分析\",title_color='#20B2AA',background_color='#F0F8FF')\n",
    "count = 1\n",
    "saleSum = 0.0\n",
    "for level in level_encoder.classes_:\n",
    "    line.add(level, attr, v[count], mark_point=[\"average\"], legend_pos ='right',legend_top ='7%')\n",
    "    saleSum += np.mean(v[count])\n",
    "    count += 1\n",
    "count = 1\n",
    "for level in level_encoder.classes_:\n",
    "    print(\"%s所占比例%.2lf%%\"%(level, np.mean(v[count])*100 / saleSum))\n",
    "    count += 1\n",
    "    \n",
    "#line.add(\"PLUS会员\", attr, v1, mark_point=[\"average\"])\n",
    "\n",
    "#line.add(\"商家B\", attr, v2, is_smooth=True, mark_line=[\"max\", \"average\"])\n",
    "line"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "通过分析上图可知，不论是哪种会员，他们在三月份前后以及十二月份前后购买手机的可能性最大，这可能大家新的一年想换一个新手机的想法有关，在这些月份进行手机推荐可能会有不错的收获。并且分析不同会员的购买情况可知，他们购买的频次降序排列为PLUS会员》金牌会员》银牌会员=钻石会员》PLUS会员（试用）》企业会员。\n",
    "分析原因，大致是因为PLUS会员大部分都属于高消费群体，他们购买这些价值昂贵的手机概率更大。因此，大致可以推断，这款手机的主要消费对象是京东商城的PLUS会员（31.00%）、金牌会员（21.87%）、银牌会员（18.52%）。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "from wordcloud import WordCloud,STOPWORDS,ImageColorGenerator\n",
    " \n",
    "data = Comment_data['评论内容'].tolist()\n",
    "dataStr = ','.join(data)\n",
    "#print(','.join(data))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "注:此处可能因评论数据量太大，出现数据溢出问题，解决方案参照[处理Jupyter Notebook报错：IOPub data rate exceeded](https://blog.csdn.net/LaoChengZier/article/details/80705298)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "from wordcloud import WordCloud,STOPWORDS,ImageColorGenerator\n",
    " \n",
    "backgroup_Image = plt.imread('phone.jpg') #笼罩图\n",
    " \n",
    "#f = open('人工智能.txt','r').read()  #生成词云的文档\n",
    "wordcloud = WordCloud(\n",
    "        background_color = 'white', #背景颜色，根据图片背景设置，默认为黑色\n",
    "        mask = backgroup_Image, #笼罩图\n",
    "        font_path = 'C:\\Windows\\Fonts\\STZHONGS.TTF',#若有中文需要设置才会显示中文\n",
    "        width = 1000,\n",
    "        height = 1200,\n",
    "        margin = 2).generate(dataStr) # generate 可以对全部文本进行自动分词\n",
    "#参数 width，height，margin分别对应宽度像素，长度像素，边缘空白处\n",
    " \n",
    "plt.imshow(wordcloud)\n",
    "plt.axis('off')\n",
    "plt.show()\n",
    " \n",
    "#保存图片：默认为此代码保存的路径\n",
    "wordcloud.to_file('phoneComment.jpg')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "从图中可以看出，存在一些无效数据，没有处理，比如hellip就是无效数据。并且标点符号的影响使得一些评论次数比较多的句子\n",
    "显示了出来，这显然不太符合预期的词图效果。\n",
    "下面去在去除一些冗余的数据之后，重新进行分词并生存词云"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "dataStr = dataStr.replace('hellip','')\n",
    "dataStr = dataStr.replace('。','')\n",
    "dataStr = dataStr.replace('，','').replace('！','').replace(',','').replace('&','').replace('...','')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# -*- coding: utf-8 -*-\n",
    "from wordcloud import WordCloud\n",
    "import matplotlib.pyplot as plt\n",
    "import jieba\n",
    "import os\n",
    "backgroup_Image = plt.imread('picture/phone.jpg') #笼罩图\n",
    "wordlist = jieba.cut(dataStr, cut_all=False)\n",
    "word_string = \" \".join(wordlist)\n",
    "wordcloud = WordCloud(font_path='front/simsun.ttc', background_color=\"white\",mask = backgroup_Image, width=1000, height=860, margin=2).generate(word_string)\n",
    "import matplotlib.pyplot as plt\n",
    "plt.imshow(wordcloud)\n",
    "plt.axis(\"off\")\n",
    "plt.show()\n",
    "wordcloud.to_file('picture/phoneComment.jpg')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "通过上面此图基本可以看出消费者对该款手机的整体评价，但其中可能还有一些刷单的情况，这些量无法控制，没法排除，因此还是要理性的看待一款手机综合性能。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "from pyecharts import Pie\n",
    "from pyecharts import Timeline\n",
    "#attr = [\"衬衫\", \"羊毛衫\", \"雪纺衫\", \"裤子\", \"高跟鞋\", \"袜子\"]\n",
    "attr = phone_type\n",
    "year1 = []\n",
    "year2 = []\n",
    "year3 = []\n",
    "Comment_data['购买时间'] = Comment_data['购买时间'].map(lambda x:x.split(' ')[0].split('-')[0])\n",
    "#print(Comment_data)\n",
    "#year1 = Comment_data[Comment_data['手机型号']==typ\n",
    "data1 = Comment_data[Comment_data['购买时间'] == '2017']\n",
    "data2 = Comment_data[Comment_data['购买时间'] == '2018']\n",
    "data3 = Comment_data[Comment_data['购买时间'] == '2019']\n",
    "#res = pd.concat([df1,df2,df3],axis=0,ignore_index=True)\n",
    "#data = Comment_data[Comment_data['手机型号']=='Apple iPhone 8 Plus 256GB 红色特别版 移动联通电信4G手机']\n",
    "v1 = [len(data1[data1['手机型号']==typ]) for typ in phone_type]\n",
    "v2 = [len(data2[data2['手机型号']==typ]) for typ in phone_type]\n",
    "v3 = [len(data3[data3['手机型号']==typ]) for typ in phone_type]\n",
    "#print(v1)\n",
    "#v2 = [len(Comment_data[Comment_data['手机型号']==typ]& Comment_data['购买时间'] == '2018') for typ in phone_type]\n",
    "#v3 = [len(Comment_data[Comment_data['手机型号']==typ]& Comment_data['购买时间'] == '2019') for typ in phone_type]\n",
    "#print(v1)\n",
    "pie_1 = Pie(\"2017 年手机售出情况\", title_pos='center',background_color='#F0F8FF')\n",
    "pie_1.add(\"\",attr,v1,radius=[40, 75],label_text_color=None,is_label_show=True,legend_orient=\"vertical\",legend_pos=\"left\",is_legend_show=False)\n",
    "pie_2 = Pie(\"2018 年手机售出情况\", title_pos='center',background_color='#F0F8FF')\n",
    "pie_2.add(\"\",attr,v2,radius=[40, 75],label_text_color=None,is_label_show=True,legend_orient=\"vertical\",legend_pos=\"left\",is_legend_show=False)\n",
    "pie_3 = Pie(\"2019 年手机售出情况\", title_pos='center',background_color='#F0F8FF')\n",
    "pie_3.add(\"\",attr,v3,radius=[40, 75],label_text_color=None,is_label_show=True,legend_orient=\"vertical\",legend_pos=\"left\",is_legend_show=False)\n",
    "timeline = Timeline(is_auto_play=True, timeline_bottom=0)\n",
    "timeline.add(pie_1, '2017 年')\n",
    "timeline.add(pie_2, '2018 年')\n",
    "timeline.add(pie_3, '2019 年')\n",
    "timeline\n",
    "#"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Referer反爬虫测试"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import requests\n",
    "\n",
    "url = \"https://sclub.jd.com/comment/productPageComments.action?callback=fetchJSON_comment98vv16247&productId=100000177760&score=0&sortType=5&page=6&pageSize=10&isShadowSku=0&rid=0&fold=1\"\n",
    "headers = {\n",
    "    'Accept': '*/*',\n",
    "    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36',\n",
    "    'Referer':\"https://item.jd.com/100000177760.html#comment\"}\n",
    "r = requests.get(url,headers=headers)\n",
    "print(r.text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "name = \"\\xca\\xd6\\xbb\\xfa\\xba\\xdc\\xba\\xc3\\xa3\\xac\\xd0\\xc2\\xca\"\n",
    "print(name.encode().decode('unicode_escape'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "anaconda-cloud": {},
  "kernelspec": {
   "display_name": "Python [conda root]",
   "language": "python",
   "name": "conda-root-py"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
